In [26]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as offline
import cufflinks as cf
py.init_notebook_mode(connected=True)
init_notebook_mode(connected=True)
offline.init_notebook_mode()
cf.go_offline()

Data

In [27]:
heloc = pd.read_excel('heloc_featureset_final.xlsx')
display(heloc.head(n=5))
RiskPerformance MSinceOldestTradeOpen AverageMInFile NetFractionRevolvingBurden NetFractionInstallBurden NumBank2NatlTradesWHighUtilization NumSatisfactoryTrades NumTrades60Ever2DerogPubRec MSinceMostRecentInqexcl7days
0 Bad 144 84 33 -8 1 20 3 0
1 Bad 58 41 0 -8 -8 2 4 0
2 Bad 66 24 53 66 1 9 0 0
3 Bad 169 73 72 83 3 28 1 0
4 Bad 333 132 51 89 0 12 0 0
In [28]:
riskperf_raw = heloc['RiskPerformance']
features = heloc.drop('RiskPerformance' , axis = 1)
dic={'Good':0, 'Bad':1}
riskperf = riskperf_raw.map(dic)
display(riskperf.head())
display(features.head().transpose())
0    1
1    1
2    1
3    1
4    1
Name: RiskPerformance, dtype: int64
0 1 2 3 4
MSinceOldestTradeOpen 144 58 66 169 333
AverageMInFile 84 41 24 73 132
NetFractionRevolvingBurden 33 0 53 72 51
NetFractionInstallBurden -8 -8 66 83 89
NumBank2NatlTradesWHighUtilization 1 -8 1 3 0
NumSatisfactoryTrades 20 2 9 28 12
NumTrades60Ever2DerogPubRec 3 4 0 1 0
MSinceMostRecentInqexcl7days 0 0 0 0 0
In [29]:
from sklearn.model_selection import train_test_split

cols = ['Model','Dataset','Best Params', 'File Name', 'True Positive', 'False Positive', 'True Negative', 'False Negative', 'Sensitivity', 'Specificity', 'G_Mean']
models_report = pd.DataFrame(columns = cols)
best_models = {}

X_train, X_test, y_train, y_test = train_test_split(features, riskperf, test_size=0.2, random_state=0)

Scorer

In [30]:
from gmean_score import Gscore
from sklearn.metrics import make_scorer
def scre(y_true, y_pred):
    scre = Gscore(y_true, y_pred)
    return scre.g_mean()   

Model Training & Optimisation

Tuning using StratifiedShuffleSplit and GridSearchCV

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
In [32]:
def optimise_model(model, model_param, X_train, y_train, data, file): 
    rskfold = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=2)
    print('='*40)
    print('Computing {} '.format(model.__class__.__name__))
    print('='*40)
    clf = GridSearchCV(estimator=model, cv=rskfold, scoring= make_scorer(scre), return_train_score=True,
              param_grid= model_param) 
    print('Fitting...')
    start = time.time() # Get start time
    clf.fit(X_train, y_train.values)
    end = time.time() # Get end time
    training_time = end-start
    print("Training time: ", training_time)
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("Mean score of %f with std dev of (%f) using params: %r" % (mean, stdev, param))
    print("\nBest estimator: " + str(clf.best_estimator_))
    y_train_pred = clf.best_estimator_.predict(X_train)
    scr = Gscore(y_train.values, y_train_pred)
    tmp = pd.Series({'Model': model.__class__.__name__,
                     'Dataset': data,
                     'Best Params': clf.best_params_,
                     'File Name': file, 
                        'True Positive' : scr.TP,
                        'False Positive' : scr.FP,
                        'True Negative' : scr.TN,
                        'False Negative' : scr.FN,
                        'Sensitivity': round(scr.sensi(),3),
                        'Specificity': round(scr.speci(),3),
                        'G_Mean': round(scr.g_mean(),3)})
          
    return clf.best_estimator_, pred_outcome(X_train,y_train,y_train_pred), tmp

Testing using cross_val_score

In [33]:
def model_predict(model, X_test, y_test, data, file):
    
    rskfold = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=2)
    print('='*40)
    print('Computing {} '.format(model.__class__.__name__))
    print('='*40)
    print('\nPredicting...')
    start = time.time() # Get start time
    y_test_pred = model.predict(X_test)
    end = time.time() # Get end time
    training_time = end-start
    print("Prediction time: ", training_time)
    scr = Gscore(y_test.values, y_test_pred)
    results = cross_val_score(model, X_test, y_test.values, cv=rskfold, scoring=make_scorer(scre))
    #print(results)
    print("10-fold cross validation average G-mean: %.3f" % (results.mean()))
    tmp = pd.Series({'Model': model.__class__.__name__,
                     'Dataset': data,
                     'Best Params': model.get_params(),
                     'File Name': file, 
                        'True Positive' : scr.TP,
                        'False Positive' : scr.FP,
                        'True Negative' : scr.TN,
                        'False Negative' : scr.FN,
                        'Sensitivity': round(scr.sensi(),3),
                        'Specificity': round(scr.speci(),3),
                        'G_Mean': round(scr.g_mean(),3)})
    return pred_outcome(X_test,y_test,y_test_pred), tmp

Predicting Outcome

In [34]:
    def pred_outcome(X,y_actual,y_pred):
        # Merge actual with predicted outcomes
        df1 = pd.DataFrame(data=y_actual.to_frame())
        df1['RiskPerformance_Pred'] = y_pred.tolist()
        
        # Classify prediction outcomes
        df1['Prediction'] = 'Correct'
        df1.loc[df1['RiskPerformance']> df1['RiskPerformance_Pred'],'Prediction'] = 'Incorrect: Bad Loan' #bad loan classified as good
        df1.loc[df1['RiskPerformance']< df1['RiskPerformance_Pred'],'Prediction'] = 'Incorrect: Good Loan' #good loan classified as bad
        
        #Visualisation of prediction
        temp = df1['Prediction'].value_counts()
        df = pd.DataFrame({'Labels': temp.index,'Counts': temp.values})
        print('\nPrediction Outcome :\n',df)
        df.iplot(kind='pie',labels='Labels',values='Counts', title='Prediction Breakdown')
        return X.join(df1)

Supervised Models

In [35]:
clfs = {'LogReg' : LogisticRegression(),
            'TreeClass': DecisionTreeClassifier(),
            'GradBoost': GradientBoostingClassifier()
            }
clfs_params = {'LogReg' : {'C':np.arange(1e-05, 0.5, 0.01), 'penalty': ['l1','l2']} ,
            'TreeClass': {'max_depth':np.arange(2, 21, 4), 'min_samples_leaf':np.arange(50, 301, 50), 'min_samples_split': np.arange(100, 501, 100) },
             'GradBoost':{'learning_rate':(0.01, 0.21, 0.1) ,'n_estimators' :np.arange(100, 501, 200),'max_depth':np.arange(2, 10, 3),'min_samples_leaf':np.arange(50, 301, 100)}
            }
    
#

Benchmark Model: Logistic Regression

Model Tuning

In [36]:
data = 'Training'
model= 'LogReg'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'
best_model, prediction_outcome, model_report= optimise_model(clfs[model],clfs_params[model],X_train, y_train, data, filename)
   
best_models[model] = best_model
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing LogisticRegression 
========================================
Fitting...
Training time:  92.33294916152954
Best: 0.696697 using {'penalty': 'l1', 'C': 0.040010000000000004}
Mean score of 0.000000 with std dev of (0.000000) using params: {'penalty': 'l1', 'C': 1.0000000000000001e-05}
Mean score of 0.671510 with std dev of (0.012113) using params: {'penalty': 'l2', 'C': 1.0000000000000001e-05}
Mean score of 0.695781 with std dev of (0.010870) using params: {'penalty': 'l1', 'C': 0.01001}
Mean score of 0.696107 with std dev of (0.010092) using params: {'penalty': 'l2', 'C': 0.01001}
Mean score of 0.695337 with std dev of (0.011165) using params: {'penalty': 'l1', 'C': 0.02001}
Mean score of 0.695768 with std dev of (0.008795) using params: {'penalty': 'l2', 'C': 0.02001}
Mean score of 0.695592 with std dev of (0.010379) using params: {'penalty': 'l1', 'C': 0.030009999999999998}
Mean score of 0.695582 with std dev of (0.009073) using params: {'penalty': 'l2', 'C': 0.030009999999999998}
Mean score of 0.696697 with std dev of (0.010147) using params: {'penalty': 'l1', 'C': 0.040010000000000004}
Mean score of 0.695361 with std dev of (0.008701) using params: {'penalty': 'l2', 'C': 0.040010000000000004}
Mean score of 0.696020 with std dev of (0.009760) using params: {'penalty': 'l1', 'C': 0.050010000000000006}
Mean score of 0.695074 with std dev of (0.008480) using params: {'penalty': 'l2', 'C': 0.050010000000000006}
Mean score of 0.695987 with std dev of (0.009007) using params: {'penalty': 'l1', 'C': 0.060010000000000001}
Mean score of 0.694671 with std dev of (0.008902) using params: {'penalty': 'l2', 'C': 0.060010000000000001}
Mean score of 0.695682 with std dev of (0.008987) using params: {'penalty': 'l1', 'C': 0.070010000000000003}
Mean score of 0.694598 with std dev of (0.008997) using params: {'penalty': 'l2', 'C': 0.070010000000000003}
Mean score of 0.695710 with std dev of (0.008785) using params: {'penalty': 'l1', 'C': 0.080009999999999998}
Mean score of 0.694524 with std dev of (0.008909) using params: {'penalty': 'l2', 'C': 0.080009999999999998}
Mean score of 0.695505 with std dev of (0.009091) using params: {'penalty': 'l1', 'C': 0.090009999999999993}
Mean score of 0.694644 with std dev of (0.008991) using params: {'penalty': 'l2', 'C': 0.090009999999999993}
Mean score of 0.695356 with std dev of (0.008743) using params: {'penalty': 'l1', 'C': 0.10001}
Mean score of 0.694511 with std dev of (0.008954) using params: {'penalty': 'l2', 'C': 0.10001}
Mean score of 0.695217 with std dev of (0.008520) using params: {'penalty': 'l1', 'C': 0.11001}
Mean score of 0.694439 with std dev of (0.008795) using params: {'penalty': 'l2', 'C': 0.11001}
Mean score of 0.695215 with std dev of (0.008677) using params: {'penalty': 'l1', 'C': 0.12000999999999999}
Mean score of 0.694431 with std dev of (0.008720) using params: {'penalty': 'l2', 'C': 0.12000999999999999}
Mean score of 0.695020 with std dev of (0.008830) using params: {'penalty': 'l1', 'C': 0.13001000000000001}
Mean score of 0.694425 with std dev of (0.008729) using params: {'penalty': 'l2', 'C': 0.13001000000000001}
Mean score of 0.694744 with std dev of (0.008898) using params: {'penalty': 'l1', 'C': 0.14001000000000002}
Mean score of 0.694299 with std dev of (0.008885) using params: {'penalty': 'l2', 'C': 0.14001000000000002}
Mean score of 0.694741 with std dev of (0.008931) using params: {'penalty': 'l1', 'C': 0.15001}
Mean score of 0.694238 with std dev of (0.008947) using params: {'penalty': 'l2', 'C': 0.15001}
Mean score of 0.694603 with std dev of (0.008840) using params: {'penalty': 'l1', 'C': 0.16001000000000001}
Mean score of 0.694238 with std dev of (0.008947) using params: {'penalty': 'l2', 'C': 0.16001000000000001}
Mean score of 0.694465 with std dev of (0.008989) using params: {'penalty': 'l1', 'C': 0.17001000000000002}
Mean score of 0.694107 with std dev of (0.009027) using params: {'penalty': 'l2', 'C': 0.17001000000000002}
Mean score of 0.694402 with std dev of (0.008771) using params: {'penalty': 'l1', 'C': 0.18001}
Mean score of 0.694168 with std dev of (0.008977) using params: {'penalty': 'l2', 'C': 0.18001}
Mean score of 0.694462 with std dev of (0.008755) using params: {'penalty': 'l1', 'C': 0.19001000000000001}
Mean score of 0.694099 with std dev of (0.009141) using params: {'penalty': 'l2', 'C': 0.19001000000000001}
Mean score of 0.694268 with std dev of (0.008727) using params: {'penalty': 'l1', 'C': 0.20001000000000002}
Mean score of 0.694101 with std dev of (0.008993) using params: {'penalty': 'l2', 'C': 0.20001000000000002}
Mean score of 0.694260 with std dev of (0.008713) using params: {'penalty': 'l1', 'C': 0.21001}
Mean score of 0.694161 with std dev of (0.009092) using params: {'penalty': 'l2', 'C': 0.21001}
Mean score of 0.694260 with std dev of (0.008713) using params: {'penalty': 'l1', 'C': 0.22001000000000001}
Mean score of 0.694222 with std dev of (0.009064) using params: {'penalty': 'l2', 'C': 0.22001000000000001}
Mean score of 0.694320 with std dev of (0.008774) using params: {'penalty': 'l1', 'C': 0.23001000000000002}
Mean score of 0.694082 with std dev of (0.009124) using params: {'penalty': 'l2', 'C': 0.23001000000000002}
Mean score of 0.694382 with std dev of (0.008849) using params: {'penalty': 'l1', 'C': 0.24001}
Mean score of 0.694018 with std dev of (0.009052) using params: {'penalty': 'l2', 'C': 0.24001}
Mean score of 0.694322 with std dev of (0.008949) using params: {'penalty': 'l1', 'C': 0.25001000000000001}
Mean score of 0.694018 with std dev of (0.009052) using params: {'penalty': 'l2', 'C': 0.25001000000000001}
Mean score of 0.694442 with std dev of (0.008833) using params: {'penalty': 'l1', 'C': 0.26001000000000002}
Mean score of 0.694084 with std dev of (0.009027) using params: {'penalty': 'l2', 'C': 0.26001000000000002}
Mean score of 0.694504 with std dev of (0.008798) using params: {'penalty': 'l1', 'C': 0.27001000000000003}
Mean score of 0.694017 with std dev of (0.008980) using params: {'penalty': 'l2', 'C': 0.27001000000000003}
Mean score of 0.694564 with std dev of (0.008699) using params: {'penalty': 'l1', 'C': 0.28001000000000004}
Mean score of 0.694017 with std dev of (0.008980) using params: {'penalty': 'l2', 'C': 0.28001000000000004}
Mean score of 0.694437 with std dev of (0.008810) using params: {'penalty': 'l1', 'C': 0.29000999999999999}
Mean score of 0.694017 with std dev of (0.008980) using params: {'penalty': 'l2', 'C': 0.29000999999999999}
Mean score of 0.694372 with std dev of (0.008861) using params: {'penalty': 'l1', 'C': 0.30001}
Mean score of 0.693950 with std dev of (0.008870) using params: {'penalty': 'l2', 'C': 0.30001}
Mean score of 0.694300 with std dev of (0.008986) using params: {'penalty': 'l1', 'C': 0.31001000000000001}
Mean score of 0.694010 with std dev of (0.008775) using params: {'penalty': 'l2', 'C': 0.31001000000000001}
Mean score of 0.694300 with std dev of (0.008986) using params: {'penalty': 'l1', 'C': 0.32001000000000002}
Mean score of 0.694010 with std dev of (0.008775) using params: {'penalty': 'l2', 'C': 0.32001000000000002}
Mean score of 0.694300 with std dev of (0.008986) using params: {'penalty': 'l1', 'C': 0.33001000000000003}
Mean score of 0.694010 with std dev of (0.008775) using params: {'penalty': 'l2', 'C': 0.33001000000000003}
Mean score of 0.694234 with std dev of (0.009017) using params: {'penalty': 'l1', 'C': 0.34001000000000003}
Mean score of 0.694131 with std dev of (0.008759) using params: {'penalty': 'l2', 'C': 0.34001000000000003}
Mean score of 0.694172 with std dev of (0.008943) using params: {'penalty': 'l1', 'C': 0.35001000000000004}
Mean score of 0.694131 with std dev of (0.008759) using params: {'penalty': 'l2', 'C': 0.35001000000000004}
Mean score of 0.694233 with std dev of (0.008913) using params: {'penalty': 'l1', 'C': 0.36001}
Mean score of 0.694065 with std dev of (0.008814) using params: {'penalty': 'l2', 'C': 0.36001}
Mean score of 0.694100 with std dev of (0.008982) using params: {'penalty': 'l1', 'C': 0.37001000000000001}
Mean score of 0.694065 with std dev of (0.008814) using params: {'penalty': 'l2', 'C': 0.37001000000000001}
Mean score of 0.694100 with std dev of (0.008982) using params: {'penalty': 'l1', 'C': 0.38001000000000001}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.38001000000000001}
Mean score of 0.694040 with std dev of (0.009042) using params: {'penalty': 'l1', 'C': 0.39001000000000002}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.39001000000000002}
Mean score of 0.694101 with std dev of (0.008993) using params: {'penalty': 'l1', 'C': 0.40001000000000003}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.40001000000000003}
Mean score of 0.694101 with std dev of (0.008993) using params: {'penalty': 'l1', 'C': 0.41001000000000004}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.41001000000000004}
Mean score of 0.694162 with std dev of (0.008965) using params: {'penalty': 'l1', 'C': 0.42000999999999999}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.42000999999999999}
Mean score of 0.694162 with std dev of (0.008965) using params: {'penalty': 'l1', 'C': 0.43001}
Mean score of 0.694000 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.43001}
Mean score of 0.694095 with std dev of (0.008855) using params: {'penalty': 'l1', 'C': 0.44001000000000001}
Mean score of 0.693940 with std dev of (0.008745) using params: {'penalty': 'l2', 'C': 0.44001000000000001}
Mean score of 0.694095 with std dev of (0.008855) using params: {'penalty': 'l1', 'C': 0.45001000000000002}
Mean score of 0.693940 with std dev of (0.008745) using params: {'penalty': 'l2', 'C': 0.45001000000000002}
Mean score of 0.694095 with std dev of (0.008855) using params: {'penalty': 'l1', 'C': 0.46001000000000003}
Mean score of 0.694059 with std dev of (0.008900) using params: {'penalty': 'l2', 'C': 0.46001000000000003}
Mean score of 0.694094 with std dev of (0.008934) using params: {'penalty': 'l1', 'C': 0.47001000000000004}
Mean score of 0.693940 with std dev of (0.008745) using params: {'penalty': 'l2', 'C': 0.47001000000000004}
Mean score of 0.694161 with std dev of (0.008918) using params: {'penalty': 'l1', 'C': 0.48000999999999999}
Mean score of 0.693940 with std dev of (0.008745) using params: {'penalty': 'l2', 'C': 0.48000999999999999}
Mean score of 0.694220 with std dev of (0.008941) using params: {'penalty': 'l1', 'C': 0.49001}
Mean score of 0.693880 with std dev of (0.008839) using params: {'penalty': 'l2', 'C': 0.49001}

Best estimator: LogisticRegression(C=0.040010000000000004, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Prediction Outcome :
    Counts                Labels
0    5493               Correct
1    1274   Incorrect: Bad Loan
2    1125  Incorrect: Good Loan

Alternative Model 1: Decision Tree Classifier

Model Tuning

In [37]:
data = 'Training'
model= 'TreeClass'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'
best_model, prediction_outcome, model_report= optimise_model(clfs[model],clfs_params[model],X_train, y_train, data, filename)
   
best_models[model] = best_model
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing DecisionTreeClassifier 
========================================
Fitting...
Training time:  57.01074004173279
Best: 0.693898 using {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 50, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 50, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 50, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 50, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 50, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 100, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 100, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 100, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 100, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 100, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 150, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 150, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 150, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 150, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 200, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 200, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 200, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 200, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 200, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 250, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 250, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 250, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 250, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 250, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 300, 'min_samples_split': 100, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 300, 'min_samples_split': 200, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 300, 'min_samples_split': 300, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 300, 'min_samples_split': 400, 'max_depth': 2}
Mean score of 0.650381 with std dev of (0.018007) using params: {'min_samples_leaf': 300, 'min_samples_split': 500, 'max_depth': 2}
Mean score of 0.688829 with std dev of (0.009753) using params: {'min_samples_leaf': 50, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.690928 with std dev of (0.007269) using params: {'min_samples_leaf': 50, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.692313 with std dev of (0.007344) using params: {'min_samples_leaf': 50, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.687077 with std dev of (0.010285) using params: {'min_samples_leaf': 50, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.685232 with std dev of (0.010755) using params: {'min_samples_leaf': 50, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.689058 with std dev of (0.009452) using params: {'min_samples_leaf': 100, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.689058 with std dev of (0.009452) using params: {'min_samples_leaf': 100, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.690507 with std dev of (0.009694) using params: {'min_samples_leaf': 100, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.687172 with std dev of (0.011105) using params: {'min_samples_leaf': 100, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.685621 with std dev of (0.010118) using params: {'min_samples_leaf': 100, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.693898 with std dev of (0.009437) using params: {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.693898 with std dev of (0.009437) using params: {'min_samples_leaf': 150, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.693898 with std dev of (0.009437) using params: {'min_samples_leaf': 150, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.690771 with std dev of (0.010640) using params: {'min_samples_leaf': 150, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.687417 with std dev of (0.008873) using params: {'min_samples_leaf': 150, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.689157 with std dev of (0.008894) using params: {'min_samples_leaf': 200, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.689157 with std dev of (0.008894) using params: {'min_samples_leaf': 200, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.689157 with std dev of (0.008894) using params: {'min_samples_leaf': 200, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.689157 with std dev of (0.008894) using params: {'min_samples_leaf': 200, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.686902 with std dev of (0.008196) using params: {'min_samples_leaf': 200, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 100, 'max_depth': 6}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 200, 'max_depth': 6}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 300, 'max_depth': 6}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 400, 'max_depth': 6}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 500, 'max_depth': 6}
Mean score of 0.688443 with std dev of (0.013028) using params: {'min_samples_leaf': 50, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.690285 with std dev of (0.009231) using params: {'min_samples_leaf': 50, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.690114 with std dev of (0.007709) using params: {'min_samples_leaf': 50, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.684653 with std dev of (0.008109) using params: {'min_samples_leaf': 50, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.682855 with std dev of (0.009196) using params: {'min_samples_leaf': 50, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.689628 with std dev of (0.009867) using params: {'min_samples_leaf': 100, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.685570 with std dev of (0.009356) using params: {'min_samples_leaf': 100, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.683938 with std dev of (0.009373) using params: {'min_samples_leaf': 100, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.689272 with std dev of (0.009168) using params: {'min_samples_leaf': 150, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.685659 with std dev of (0.008980) using params: {'min_samples_leaf': 150, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.685144 with std dev of (0.008201) using params: {'min_samples_leaf': 200, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 100, 'max_depth': 10}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 200, 'max_depth': 10}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 300, 'max_depth': 10}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 400, 'max_depth': 10}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 500, 'max_depth': 10}
Mean score of 0.688109 with std dev of (0.013115) using params: {'min_samples_leaf': 50, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.690285 with std dev of (0.009231) using params: {'min_samples_leaf': 50, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.690114 with std dev of (0.007709) using params: {'min_samples_leaf': 50, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.684653 with std dev of (0.008109) using params: {'min_samples_leaf': 50, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.682855 with std dev of (0.009196) using params: {'min_samples_leaf': 50, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.689628 with std dev of (0.009867) using params: {'min_samples_leaf': 100, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.685570 with std dev of (0.009356) using params: {'min_samples_leaf': 100, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.683938 with std dev of (0.009373) using params: {'min_samples_leaf': 100, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.689272 with std dev of (0.009168) using params: {'min_samples_leaf': 150, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.685659 with std dev of (0.008980) using params: {'min_samples_leaf': 150, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.685144 with std dev of (0.008201) using params: {'min_samples_leaf': 200, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 100, 'max_depth': 14}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 200, 'max_depth': 14}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 300, 'max_depth': 14}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 400, 'max_depth': 14}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 500, 'max_depth': 14}
Mean score of 0.688443 with std dev of (0.013028) using params: {'min_samples_leaf': 50, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.690285 with std dev of (0.009231) using params: {'min_samples_leaf': 50, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.690114 with std dev of (0.007709) using params: {'min_samples_leaf': 50, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.684653 with std dev of (0.008109) using params: {'min_samples_leaf': 50, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.682855 with std dev of (0.009196) using params: {'min_samples_leaf': 50, 'min_samples_split': 500, 'max_depth': 18}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.690331 with std dev of (0.011049) using params: {'min_samples_leaf': 100, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.689628 with std dev of (0.009867) using params: {'min_samples_leaf': 100, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.685570 with std dev of (0.009356) using params: {'min_samples_leaf': 100, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.683938 with std dev of (0.009373) using params: {'min_samples_leaf': 100, 'min_samples_split': 500, 'max_depth': 18}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.692849 with std dev of (0.009038) using params: {'min_samples_leaf': 150, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.689272 with std dev of (0.009168) using params: {'min_samples_leaf': 150, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.685659 with std dev of (0.008980) using params: {'min_samples_leaf': 150, 'min_samples_split': 500, 'max_depth': 18}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.686881 with std dev of (0.007429) using params: {'min_samples_leaf': 200, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.685144 with std dev of (0.008201) using params: {'min_samples_leaf': 200, 'min_samples_split': 500, 'max_depth': 18}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.682980 with std dev of (0.012259) using params: {'min_samples_leaf': 250, 'min_samples_split': 500, 'max_depth': 18}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 100, 'max_depth': 18}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 200, 'max_depth': 18}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 300, 'max_depth': 18}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 400, 'max_depth': 18}
Mean score of 0.681860 with std dev of (0.013378) using params: {'min_samples_leaf': 300, 'min_samples_split': 500, 'max_depth': 18}

Best estimator: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=150, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Prediction Outcome :
    Counts                Labels
0    5596               Correct
1    1228   Incorrect: Bad Loan
2    1068  Incorrect: Good Loan

Alternative Model 2: Gradient Boosting Classifier

Model Tuning

In [38]:
data = 'Training'
model= 'GradBoost'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'
best_model, prediction_outcome, model_report= optimise_model(clfs[model],clfs_params[model],X_train, y_train, data, filename)
   
best_models[model] = best_model
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing GradientBoostingClassifier 
========================================
Fitting...
Training time:  1904.7577140331268
Best: 0.711585 using {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.691711 with std dev of (0.009801) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.704421 with std dev of (0.009934) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.710070 with std dev of (0.008936) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.691376 with std dev of (0.009566) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.704151 with std dev of (0.008894) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.708314 with std dev of (0.008066) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.691162 with std dev of (0.009684) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.703723 with std dev of (0.009205) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.708380 with std dev of (0.008195) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 2}
Mean score of 0.702987 with std dev of (0.010053) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.708088 with std dev of (0.010796) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.706868 with std dev of (0.009181) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.701450 with std dev of (0.008844) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.708295 with std dev of (0.009570) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.707427 with std dev of (0.009116) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.699028 with std dev of (0.010474) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.709201 with std dev of (0.010480) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.708684 with std dev of (0.009717) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 5}
Mean score of 0.701960 with std dev of (0.009941) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.705140 with std dev of (0.009319) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.704432 with std dev of (0.009863) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.700759 with std dev of (0.008300) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.706241 with std dev of (0.009279) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.705497 with std dev of (0.008549) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.699542 with std dev of (0.011190) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.709331 with std dev of (0.010363) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.708696 with std dev of (0.009733) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.01, 'max_depth': 8}
Mean score of 0.710064 with std dev of (0.010271) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.707213 with std dev of (0.007574) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.704192 with std dev of (0.010002) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.709604 with std dev of (0.007569) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.703937 with std dev of (0.010186) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.703205 with std dev of (0.008184) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.710053 with std dev of (0.008235) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.707064 with std dev of (0.009172) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.705721 with std dev of (0.009436) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 2}
Mean score of 0.697775 with std dev of (0.011744) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.687078 with std dev of (0.010814) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.676125 with std dev of (0.010681) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.699230 with std dev of (0.008262) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.691637 with std dev of (0.006627) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.686981 with std dev of (0.007388) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.704429 with std dev of (0.007564) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.695232 with std dev of (0.006875) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.690068 with std dev of (0.005554) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 5}
Mean score of 0.689842 with std dev of (0.010155) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.676321 with std dev of (0.011389) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.669869 with std dev of (0.012228) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.695471 with std dev of (0.009265) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.680896 with std dev of (0.009455) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.674377 with std dev of (0.010906) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.697049 with std dev of (0.007918) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.688549 with std dev of (0.008936) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.681107 with std dev of (0.009453) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.21, 'max_depth': 8}
Mean score of 0.711480 with std dev of (0.010201) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.709903 with std dev of (0.011939) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.708520 with std dev of (0.010534) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.711016 with std dev of (0.009016) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.708438 with std dev of (0.009341) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.708923 with std dev of (0.010097) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.711585 with std dev of (0.009416) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.710282 with std dev of (0.009420) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.708263 with std dev of (0.009844) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 2}
Mean score of 0.705416 with std dev of (0.009253) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.697422 with std dev of (0.009561) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.692261 with std dev of (0.010459) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.703542 with std dev of (0.008197) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.699404 with std dev of (0.006793) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.695204 with std dev of (0.007784) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.707539 with std dev of (0.009926) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.701556 with std dev of (0.008392) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.699423 with std dev of (0.006263) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 5}
Mean score of 0.700662 with std dev of (0.007969) using params: {'n_estimators': 100, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.689352 with std dev of (0.010317) using params: {'n_estimators': 300, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.682077 with std dev of (0.010872) using params: {'n_estimators': 500, 'min_samples_leaf': 50, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.700679 with std dev of (0.008758) using params: {'n_estimators': 100, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.693304 with std dev of (0.009076) using params: {'n_estimators': 300, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.687611 with std dev of (0.008796) using params: {'n_estimators': 500, 'min_samples_leaf': 150, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.703441 with std dev of (0.009364) using params: {'n_estimators': 100, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.697149 with std dev of (0.007637) using params: {'n_estimators': 300, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 8}
Mean score of 0.692953 with std dev of (0.007623) using params: {'n_estimators': 500, 'min_samples_leaf': 250, 'learning_rate': 0.1, 'max_depth': 8}

Best estimator: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=250, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

Prediction Outcome :
    Counts                Labels
0    5731               Correct
1    1131  Incorrect: Good Loan
2    1030   Incorrect: Bad Loan

Model Evaluation

In [39]:
data = 'Testing'
model= 'LogReg'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'

prediction_outcome, model_report= model_predict(best_models[model],X_test, y_test, data, filename)
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing LogisticRegression 
========================================

Predicting...
Prediction time:  0.015403509140014648
10-fold cross validation average G-mean: 0.702

Prediction Outcome :
    Counts                Labels
0    1362               Correct
1     326   Incorrect: Bad Loan
2     285  Incorrect: Good Loan
In [40]:
data = 'Testing'
model= 'TreeClass'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'

prediction_outcome, model_report= model_predict(best_models[model],X_test, y_test, data, filename)
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing DecisionTreeClassifier 
========================================

Predicting...
Prediction time:  0.0
10-fold cross validation average G-mean: 0.658

Prediction Outcome :
    Counts                Labels
0    1363               Correct
1     315   Incorrect: Bad Loan
2     295  Incorrect: Good Loan
In [41]:
data = 'Testing'
model= 'GradBoost'
filename = model+'_'+data+ time.strftime("%Y%m%d-%H%M%S")+ '.xlsx'

prediction_outcome, model_report= model_predict(best_models[model],X_test, y_test, data, filename)
prediction_outcome.to_excel(filename)
models_report = models_report.append(model_report, ignore_index = True)
========================================
Computing GradientBoostingClassifier 
========================================

Predicting...
Prediction time:  0.012991666793823242
10-fold cross validation average G-mean: 0.707

Prediction Outcome :
    Counts                Labels
0    1412               Correct
1     299  Incorrect: Good Loan
2     262   Incorrect: Bad Loan

Evaluation of models and final model selection

In [42]:
display(models_report)
outputfile = 'Model_Report_' + time.strftime("%Y%m%d")+ '.xlsx'
models_report.to_excel(outputfile, index = False)
Model Dataset Best Params File Name True Positive False Positive True Negative False Negative Sensitivity Specificity G_Mean
0 LogisticRegression Training {'penalty': 'l1', 'C': 0.04001} LogReg_Training20180826-010456.xlsx 2678.0 1274.0 2815.0 1125.0 0.704 0.688 0.696
1 DecisionTreeClassifier Training {'min_samples_leaf': 150, 'min_samples_split':... TreeClass_Training20180826-010631.xlsx 2735.0 1228.0 2861.0 1068.0 0.719 0.700 0.709
2 GradientBoostingClassifier Training {'n_estimators': 100, 'min_samples_leaf': 250,... GradBoost_Training20180826-010731.xlsx 2672.0 1030.0 3059.0 1131.0 0.703 0.748 0.725
3 LogisticRegression Testing {'multi_class': 'ovr', 'verbose': 0, 'penalty'... LogReg_Testing20180826-013922.xlsx 645.0 326.0 717.0 285.0 0.694 0.687 0.690
4 DecisionTreeClassifier Testing {'max_features': None, 'min_weight_fraction_le... TreeClass_Testing20180826-013924.xlsx 635.0 315.0 728.0 295.0 0.683 0.698 0.690
5 GradientBoostingClassifier Testing {'min_weight_fraction_leaf': 0.0, 'presort': '... GradBoost_Testing20180826-013925.xlsx 631.0 262.0 781.0 299.0 0.678 0.749 0.713

Model Interpretation

Global Model Interpretation

Feature Importance

In [43]:
def print_coefs(coefs, training_columns):
    sorted_idx = np.argsort(-abs(coefs)) # negative for descending
    for (name, coef) in zip(np.array(training_columns)[sorted_idx], coefs[sorted_idx]):
        print("%s: %f" % (name, coef))

def plot_coefs(coefs, training_columns, title_suffix=''):
    sorted_idx = np.argsort(abs(coefs))
    pos = np.arange(len(coefs)) + .5
    plt.subplot(1, 2, 2)
    plt.barh(pos, coefs[sorted_idx], align='center')
    plt.yticks(pos, np.array(training_columns)[sorted_idx])
    plt.xlabel('Coef value')
    plt.title('Feature Importance' + title_suffix)
    plt.xlim(0, 0.5)
    plt.show()

Decision Tree Classifier

In [44]:
model='TreeClass'
importances = best_models[model].feature_importances_

print_coefs(importances,list(X_train.columns.values) )
plot_coefs(importances,list(X_train.columns.values) )
NetFractionRevolvingBurden: 0.486692
AverageMInFile: 0.195821
MSinceMostRecentInqexcl7days: 0.148487
NumTrades60Ever2DerogPubRec: 0.129505
NumSatisfactoryTrades: 0.029375
MSinceOldestTradeOpen: 0.006822
NumBank2NatlTradesWHighUtilization: 0.002438
NetFractionInstallBurden: 0.000859

Gradient Boost

In [45]:
model='GradBoost'
importances = best_models[model].feature_importances_

print_coefs(importances,list(X_train.columns.values) )
plot_coefs(importances,list(X_train.columns.values) )
NetFractionRevolvingBurden: 0.206595
MSinceMostRecentInqexcl7days: 0.205278
NumSatisfactoryTrades: 0.141555
AverageMInFile: 0.134602
NumBank2NatlTradesWHighUtilization: 0.133808
NumTrades60Ever2DerogPubRec: 0.099691
MSinceOldestTradeOpen: 0.041961
NetFractionInstallBurden: 0.036511

Local Model Interpretation

Local Interpretable Model-Agnostic Explanations (LIME)

In [46]:
import random
import lime
import lime.lime_tabular
from __future__ import print_function
import warnings
warnings.filterwarnings('ignore')

def local_exp(model, train_set, testinstance, num_feats):
    print('='*40)
    print('Model: {} '.format(model.__class__.__name__))
    print('='*40)

    # create explainer
    explainer = lime.lime_tabular.LimeTabularExplainer(train_set.values, feature_names=list(train_set.columns.values), class_names=['Good (0)','Bad (1)'], discretize_continuous=True)
    
    # explain instance
    exp = explainer.explain_instance(testinstance, model.predict_proba, num_features=num_feats)
    
    # show results
    exp.show_in_notebook(show_table=True, show_all=True)

    return True

Correct Classification

Good Loan

In [47]:
#test_instance_index =random.choice(X_test.index.values.tolist())
test_instance_index = 2506
# 3986 bad loan classified as good
# 6705 bad loan classified as bad
# 8050 good loan classified as bad
print('Loan\n\tIndex: {} \t Actual Outcome: {} \n'.format(test_instance_index, y_test.loc[test_instance_index]))
print('Attributes: ')
print(X_test.loc[test_instance_index])

Explain = local_exp(best_models['GradBoost'],X_train, X_test.loc[test_instance_index].values,5)
    
Loan
	Index: 2506 	 Actual Outcome: 0 

Attributes: 
MSinceOldestTradeOpen                 117
AverageMInFile                         57
NetFractionRevolvingBurden              0
NetFractionInstallBurden               -8
NumBank2NatlTradesWHighUtilization      0
NumSatisfactoryTrades                  29
NumTrades60Ever2DerogPubRec             0
MSinceMostRecentInqexcl7days            0
Name: 2506, dtype: int64
========================================
Model: GradientBoostingClassifier 
========================================

Correct Classification

Bad Loan

In [48]:
#test_instance_index =random.choice(X_test.index.values.tolist())
test_instance_index = 6705
# 3986 bad loan classified as good
# 6705 bad loan classified as bad
# 8050 good loan classified as bad
# 2506 good loan classified as good
print('Loan\n\tIndex: {} \t Actual Outcome: {} \n'.format(test_instance_index, y_test.loc[test_instance_index]))
print('Attributes: ')
print(X_test.loc[test_instance_index])

Explain = local_exp(best_models['GradBoost'],X_train, X_test.loc[test_instance_index].values,5)
    
Loan
	Index: 6705 	 Actual Outcome: 1 

Attributes: 
MSinceOldestTradeOpen                 76
AverageMInFile                        42
NetFractionRevolvingBurden            48
NetFractionInstallBurden              95
NumBank2NatlTradesWHighUtilization     0
NumSatisfactoryTrades                  6
NumTrades60Ever2DerogPubRec            1
MSinceMostRecentInqexcl7days           0
Name: 6705, dtype: int64
========================================
Model: GradientBoostingClassifier 
========================================

Inorrect Classification

Good Loan classified as Bad

In [49]:
#test_instance_index =random.choice(X_test.index.values.tolist())
test_instance_index = 449
print('Loan\n\tIndex: {} \t Actual Outcome: {} \n'.format(test_instance_index, y_test.loc[test_instance_index]))
print('Attributes: ')
print(X_test.loc[test_instance_index])

Explain = local_exp(best_models['GradBoost'],X_train, X_test.loc[test_instance_index].values,5)
    
Loan
	Index: 449 	 Actual Outcome: 0 

Attributes: 
MSinceOldestTradeOpen                 156
AverageMInFile                         51
NetFractionRevolvingBurden             31
NetFractionInstallBurden               69
NumBank2NatlTradesWHighUtilization      0
NumSatisfactoryTrades                  14
NumTrades60Ever2DerogPubRec             0
MSinceMostRecentInqexcl7days            0
Name: 449, dtype: int64
========================================
Model: GradientBoostingClassifier 
========================================

Inorrect Classification

Bad Loan classified as Good

In [50]:
test_instance_index =random.choice(X_test.index.values.tolist())
test_instance_index = 5221

print('Loan\n\tIndex: {} \t Actual Outcome: {} \n'.format(test_instance_index, y_test.loc[test_instance_index]))
print('Attributes: ')
print(X_test.loc[test_instance_index])

Explain = local_exp(best_models['GradBoost'],X_train, X_test.loc[test_instance_index].values,5)
    
Loan
	Index: 5221 	 Actual Outcome: 1 

Attributes: 
MSinceOldestTradeOpen                 168
AverageMInFile                         85
NetFractionRevolvingBurden             40
NetFractionInstallBurden               95
NumBank2NatlTradesWHighUtilization      0
NumSatisfactoryTrades                  23
NumTrades60Ever2DerogPubRec             0
MSinceMostRecentInqexcl7days            0
Name: 5221, dtype: int64
========================================
Model: GradientBoostingClassifier 
========================================